{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#协同过滤的推荐引擎\n",
    "#比较用户或者物品之间的相似度，进行推荐\n",
    "#note:用户的数量很多时，我们倾向于，“基于物品相似度的计算方法”\n",
    "\n",
    "#物品之间相似度的计算，不是简单的，利用物品的属性进行距离计算，而是\n",
    "#利用用户对他们的意见进行相似度计算————根据“评价矩阵”（用户 i 对物品 j 的评分矩阵）\n",
    "\n",
    "\n",
    "#首先定义几个相似度计算函数\n",
    "\n",
    "import numpy as np\n",
    "#欧式距离计算相似度\n",
    "#这里，我们希望，相似度在0-1之间。所以用公式：相似度 = 1/(1+距离)\n",
    "def euclidSimilarity(A,B):\n",
    "    #A,B都是列向量,表明是基于物品的相似度\n",
    "    dis = np.linalg.norm(A-B)\n",
    "    return 1.0 / (1.0 + dis) #一定要用1.0\n",
    "\n",
    "#皮尔逊相似系数\n",
    "def pearsonSimilarity(A,B):\n",
    "    if len(A) < 3:\n",
    "        return 1.0 #检查是否含有三个及以上的点，否则两个向量完全相关\n",
    "    return 0.5 + 0.5 * np.corrcoef(A,B,rowvar=0)[0,1] #规范到0-1\n",
    "\n",
    "#余弦相似度\n",
    "def cosSimilarity(A,B):\n",
    "    num = float(A.T.dot(B))\n",
    "    denom = np.linalg.norm(A) * np.linalg.norm(B)\n",
    "    return 0.5 + 0.5 * (num / denom) #规范到0-1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.1336766024\n0.237686194076\n0.547245559126\n"
     ]
    }
   ],
   "source": [
    "# Build a small matrix to try out the three similarity functions.\n",
    "\n",
    "# Rows are users and columns are items, so comparing two columns gives an\n",
    "# item-based similarity.\n",
    "a = np.array([[1,1,1,0,0],\n",
    "              [2,2,2,0,0],\n",
    "              [1,1,1,0,0],\n",
    "              [5,5,5,0,0],\n",
    "              [1,1,0,2,2],\n",
    "              [0,0,0,3,3],\n",
    "              [0,0,0,1,1]])\n",
    "\n",
    "\n",
    "# print(...) with a single argument behaves the same on Python 2 and Python 3.\n",
    "S1 = euclidSimilarity(a[:,0],a[:,4])\n",
    "print(S1)\n",
    "\n",
    "S2 = pearsonSimilarity(a[:,0],a[:,4])\n",
    "print(S2)\n",
    "\n",
    "S3 = cosSimilarity(a[:,0],a[:,4])\n",
    "print(S3)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#根据相似度函数计算评分\n",
    "#userID 表示需要推荐的用户index,  itemID 表示 此用户没有评级过的物品index\n",
    "def score(data,userID,itemID,simiFunction):\n",
    "    n = data.shape[1] #物品的数量\n",
    "    simTotal = 0.0 #相似度的总和\n",
    "    rateSimTotal = 0.0 #评级的总和\n",
    "    #对每一个物品进行遍历，\n",
    "    for j in xrange(n):\n",
    "        userRating = data[userID,j] #用户已经评级的物品分数\n",
    "        similarity = 0\n",
    "        if userRating==0:\n",
    "            continue #分数为0，表示没有评过级,所以j表示评级的物品index\n",
    "        #寻找某一个物品J，跟物品itemID,他们都被评级的下标,只要下标，所以有一个[0]\n",
    "        bothRateIndex = np.nonzero(np.logical_and(data[:,itemID]>0,data[:,j]>0))[0]\n",
    "        if len(bothRateIndex)==0:\n",
    "            similarity = 0\n",
    "        else:\n",
    "            #基于这些都评级的数字，进行两个物体的相似度计算\n",
    "            similarity = simiFunction(data[bothRateIndex,itemID],data[bothRateIndex,j])\n",
    "        simTotal += similarity\n",
    "        rateSimTotal += similarity * userRating  #itemID的评级 += 与j的相似度 * j的评级 (相当于加权平均)\n",
    "    if simTotal == 0:\n",
    "        return 0\n",
    "    else:\n",
    "        return rateSimTotal / simTotal #(评级在0-5之间) \n",
    "        #相当于加权平均——average_score = score * weight / sum(weight)\n",
    "    \n",
    "def recommend(data,userID,favorite = 3,simiFunction = cosSimilarity):\n",
    "    \"\"\"Rank the items that user `userID` has not rated by estimated score.\n",
    "\n",
    "    Returns at most `favorite` (item index, estimated score) tuples, highest\n",
    "    score first. If the user's row contains no zero entry, a fixed message\n",
    "    string is returned instead of a list.\n",
    "    \"\"\"\n",
    "    candidates = np.nonzero(data[userID,:] == 0)[0]  # indices of unrated items\n",
    "    if len(candidates) == 0:\n",
    "        return \"you have rated everything\"\n",
    "    ranked = [(item, score(data,userID,item,simiFunction)) for item in candidates]\n",
    "    # Stable descending sort on the estimated score.\n",
    "    ranked.sort(key=lambda pair: pair[1], reverse=True)\n",
    "    return ranked[:favorite]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(2, 2.5), (1, 2.0243290220056256)]\n"
     ]
    }
   ],
   "source": [
    "# Rating matrix for the demo: rows are users, columns are items, 0 = not rated.\n",
    "data = np.array([[4,4,0,2,2],\n",
    "                 [4,0,0,3,3],\n",
    "                 [4,0,0,1,1],\n",
    "                 [1,1,1,2,0],\n",
    "                 [2,2,2,0,0],\n",
    "                 [1,1,1,0,0],\n",
    "                 [5,5,5,0,0]])\n",
    "\n",
    "# Recommend items for the user in row 2.\n",
    "recommendation = recommend(data,2)\n",
    "# print(...) with a single argument behaves the same on Python 2 and Python 3.\n",
    "print(recommendation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3 4 5 6]\n[1 2]\n[4 4]\n"
     ]
    }
   ],
   "source": [
    "# Try out the NumPy indexing calls used by score() and recommend().\n",
    "data = np.array([[4,4,0,2,2],\n",
    "                 [4,0,0,3,3],\n",
    "                 [4,0,0,1,1],\n",
    "                 [1,1,1,2,0],\n",
    "                 [2,2,2,0,0],\n",
    "                 [1,1,1,0,0],\n",
    "                 [5,5,5,0,0]])\n",
    "# Rows where both column 1 and column 2 are non-zero.\n",
    "print(np.nonzero(np.logical_and(data[:,1]>0,data[:,2]>0))[0])\n",
    "\n",
    "# Column indices of the zero entries in row 2.\n",
    "print(np.nonzero(data[2,:] == 0)[0])\n",
    "\n",
    "# Fancy indexing: column 0 of rows 1 and 2.\n",
    "print(data[[1,2],0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
